7.) Simulation in R

library(ggplot2)
library(plotly)

a.)

# a.) The predictor: integers 1 through 100
x <- seq_len(100)
x
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [91]  91  92  93  94  95  96  97  98  99 100

b.)

# b.) Deterministic linear signal: w = 2 + 0.5 * x (vectorized over x)
w <- 0.5 * x + 2
w
##   [1]  2.5  3.0  3.5  4.0  4.5  5.0  5.5  6.0  6.5  7.0  7.5  8.0  8.5  9.0  9.5
##  [16] 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5 16.0 16.5 17.0
##  [31] 17.5 18.0 18.5 19.0 19.5 20.0 20.5 21.0 21.5 22.0 22.5 23.0 23.5 24.0 24.5
##  [46] 25.0 25.5 26.0 26.5 27.0 27.5 28.0 28.5 29.0 29.5 30.0 30.5 31.0 31.5 32.0
##  [61] 32.5 33.0 33.5 34.0 34.5 35.0 35.5 36.0 36.5 37.0 37.5 38.0 38.5 39.0 39.5
##  [76] 40.0 40.5 41.0 41.5 42.0 42.5 43.0 43.5 44.0 44.5 45.0 45.5 46.0 46.5 47.0
##  [91] 47.5 48.0 48.5 49.0 49.5 50.0 50.5 51.0 51.5 52.0

c.)

# c.) Draw 100 values from N(0, sd = 5) and summarize the sample.
# The seed makes the draw reproducible.
set.seed(123)
myNormVec <- rnorm(100, sd = 5)

cat("The mean is ", mean(myNormVec), "\n")
cat("The standard deviation is ", sd(myNormVec), "\n")

# Keep x in a data frame for the later parts; wrap the draws so ggplot
# can map them to an aesthetic.
Norm.df <- data.frame(x)
myNormDist <- as.data.frame(myNormVec)

noise_hist <- ggplot(myNormDist, aes(x = myNormVec)) +
  geom_histogram(bins = 15, fill = "#8a5789")
ggplotly(noise_hist)

We notice that even though we draw the sample from a normal distribution with a specified mean and standard deviation, the sample mean and standard deviation differ from those population values. Observing our histogram, we can also see that our distribution is not perfectly symmetric.

d.)

# d.) Simulated response: signal w plus the normal noise.
# (data.frame + vector adds the vector to the single column, so y is a
# one-column data frame named myNormVec.)
y <- w + myNormDist
head(y, 15)
##     myNormVec
## 1  -0.3023782
## 2   1.8491126
## 3  11.2935416
## 4   4.3525420
## 5   5.1464387
## 6  13.5753249
## 7   7.8045810
## 8  -0.3253062
## 9   3.0657357
## 10  4.7716901
## 11 13.6204090
## 12  9.7990691
## 13 10.5038573
## 14  9.5534136
## 15  6.7207943

e.)

# e.) Attach the simulated response to the data frame of x values,
# then scatter-plot y against x.
Norm.df[["y"]] <- y[[1]]
names(Norm.df) <- c("x", "y")
head(Norm.df)
# ggplot's default label base size is 11; scale it up by 1.5x
# https://ggplot2.tidyverse.org/reference/ggtheme.html
xy_scatter <- ggplot(Norm.df, aes(x = x, y = y)) +
  geom_point(color = "#7eb6ff") +
  ggtitle("Y Versus X") +
  theme_grey(base_size = 11 * 1.5)
ggplotly(xy_scatter)

f.)

# f.) Closed-form least-squares estimates:
#   Beta1 = Sxy / Sxx,  Beta0 = ybar - Beta1 * xbar
xbar <- mean(Norm.df$x)
ybar <- mean(Norm.df$y)

Beta1 <- sum((Norm.df$x - xbar) * (Norm.df$y - ybar)) /
  sum((Norm.df$x - xbar)^2)
Beta0 <- ybar - Beta1 * xbar

# Fitted values from the estimated regression line
Norm.df[["Ypred"]] <- Beta0 + Beta1 * Norm.df$x

fit_plot <- ggplot(Norm.df, aes(x = x, y = y)) +
  geom_point(color = "#7eb6ff") +
  geom_line(aes(y = Ypred)) +
  ggtitle("Y Versus X") +
  theme_grey(base_size = 11 * 1.5)
ggplotly(fit_plot)

We can see that the least squares line is a strong fit to our data. We notice that the relationship between Y and X is highly linear and our least squares line neatly describes the trend of the data.

g.)

# g.) Residuals e_i = y_i - yhat_i, residual plot, and the unbiased MSE
# estimate SSE / (n - 2).
Norm.df[["ei"]] <- Norm.df$y - Norm.df$Ypred

resid_plot <- ggplot(Norm.df, aes(x = x, y = ei)) +
  geom_point(color = "#75a876") +
  ggtitle("Residuals Versus X") +
  theme_grey(base_size = 11 * 1.5)
ggplotly(resid_plot)

cat("The MSE is", sum(Norm.df$ei^2) / (nrow(Norm.df) - 2))
## The MSE is 20.90935

h.)

# h.) Repeat the simulation 5 times with different seeds; refit the
# least-squares line for each replicate and collect the fit plot and the
# residual plot. Lists are preallocated instead of grown in the loop.
plotList <- vector("list", 5)
plotList2 <- vector("list", 5)

for (i in seq_len(5)) {
  set.seed(i)
  xLoop <- x
  yLoop <- rnorm(100, sd = 5) + w

  # Closed-form least-squares estimates for THIS replicate
  Beta1Loop <- sum((xLoop - mean(xLoop)) * (yLoop - mean(yLoop))) /
    sum((xLoop - mean(xLoop))^2)
  # BUG FIX: the intercept must use this replicate's slope (Beta1Loop);
  # the original used the global Beta1 estimated in part f, so every
  # plotted line and residual was computed from a mismatched slope.
  Beta0Loop <- mean(yLoop) - Beta1Loop * mean(xLoop)

  Loop.df <- data.frame(x = xLoop,
                        y = yLoop,
                        Ypred = Beta0Loop + Beta1Loop * xLoop)

  # Residuals for this replicate
  Loop.df$ei <- Loop.df$y - Loop.df$Ypred

  # Fitted-line plot for y versus x
  plotList[[i]] <- ggplotly(
    ggplot(Loop.df, aes(x = x, y = y)) +
      geom_point(color = "#7eb6ff") +
      geom_line(aes(y = Ypred)) +
      ggtitle("Y Versus X") +
      theme_grey(base_size = 11 * 1.5)
  )

  # Residuals-versus-x plot
  plotList2[[i]] <- ggplotly(
    ggplot(Loop.df, aes(x = x, y = ei)) +
      geom_point(color = "#75a876") +
      ggtitle("Residuals Versus X") +
      theme_grey(base_size = 11 * 1.5)
  )
}

# Render each replicate's pair of plots
for (i in seq_along(plotList)) {
  print(plotList[[i]])
  print(plotList2[[i]])
}

From the plots we can notice that the slope of the line changes between positive and negative while also being very close to zero. There is no repetitive pattern of the same observed line with each simulation. Regarding our residual plots, we can see random scatter around the horizontal axis, with no signs of patterns or outliers.

i.) (Optional Problem)

# i.) (Optional) 1000 replicates: record the least-squares coefficients and
# the MSE from each, to study their sampling distributions.
# Preallocate the result frame instead of growing it row by row.
n_sims <- 1000
dfOpt <- data.frame(Beta0 = numeric(n_sims),
                    Beta1 = numeric(n_sims),
                    MSE   = numeric(n_sims))

for (i in seq_len(n_sims)) {
  set.seed(i + 100)
  xOpt <- x
  yOpt <- rnorm(100, sd = 5) + w

  # BUG FIX: the slope denominator is Sxx = sum((x - xbar)^2); the
  # original divided by Syy, which biases every estimate.
  B1opt <- sum((xOpt - mean(xOpt)) * (yOpt - mean(yOpt))) /
    sum((xOpt - mean(xOpt))^2)
  B0opt <- mean(yOpt) - B1opt * mean(xOpt)
  YpredOpt <- B0opt + B1opt * xOpt
  # BUG FIX: divide the SSE by (n - 2); the original computed
  # sum(...) / n - 2 because of operator precedence.
  MSEopt <- sum((yOpt - YpredOpt)^2) / (length(xOpt) - 2)

  dfOpt[i, ] <- c(B0opt, B1opt, MSEopt)
}
# NOTE(review): the echoed output further down was produced by the original
# (buggy) formulas; re-render the document to refresh those numbers.

# Summaries of the sampling distributions: a histogram plus the mean and
# variance for each of Beta0, Beta1, and the MSE.
head(dfOpt)
##       Beta0    Beta1      MSE
## 1 -62.78229 1.779135 1358.026
## 2 -67.47688 1.886618 1722.037
## 3 -64.14851 1.810801 1454.264
## 4 -62.56227 1.785080 1366.427
## 5 -63.62236 1.798051 1391.694
## 6 -65.18540 1.827124 1504.575
beta0_hist <- ggplot(dfOpt, aes(x = Beta0)) +
  geom_histogram(fill = "#4b0082")
ggplotly(beta0_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cat("The mean of Beta0 is",
    mean(dfOpt$Beta0),
    "and the variance is",
    var(dfOpt$Beta0),
    ".")
## The mean of Beta0 is -63.1689 and the variance is 7.983073 .
beta1_hist <- ggplot(dfOpt, aes(x = Beta1)) +
  geom_histogram(fill = "#00824b")
ggplotly(beta1_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cat("The mean of Beta1 is",
    mean(dfOpt$Beta1),
    "and the variance is",
    var(dfOpt$Beta1),
    ".")
## The mean of Beta1 is 1.790269 and the variance is 0.003053532 .
mse_hist <- ggplot(dfOpt, aes(x = MSE)) +
  geom_histogram(fill = "#824b00")
ggplotly(mse_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cat("The mean of MSE is",
    mean(dfOpt$MSE),
    "and the variance is",
    var(dfOpt$MSE),
    ".")
## The mean of MSE is 1414.112 and the variance is 22906.62 .

We can see from the simulation of a thousand samples that the distributions of both \(\beta_0\) and \(\beta_1\) are approximately normal. The distribution of our MSE is also approximately normal. The means of all estimators approach the true values (which we define by the simulation). Between the two least squares coefficients, \(\beta_1\) has far less spread than \(\beta_0\) (variance ≈ 0.003 versus ≈ 7.98 in the reported output).